1   package org.apache.lucene.search.spell;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import org.apache.lucene.util.IntsRef;
21  
22  /**
23   *  Damerau-Levenshtein (optimal string alignment) implemented in a consistent 
24   *  way as Lucene's FuzzyTermsEnum with the transpositions option enabled.
25   *  
26   *  Notes:
27   *  <ul>
28   *    <li> This metric treats full unicode codepoints as characters
29   *    <li> This metric scales raw edit distances into a floating point score
30   *         based upon the shortest of the two terms
31   *    <li> Transpositions of two adjacent codepoints are treated as primitive 
32   *         edits.
33   *    <li> Edits are applied in parallel: for example, "ab" and "bca" have 
34   *         distance 3.
35   *  </ul>
36   *  
37   *  NOTE: this class is not particularly efficient. It is only intended
38   *  for merging results from multiple DirectSpellCheckers.
39   */
40  public final class LuceneLevenshteinDistance implements StringDistance {
41    
42    /**
43     * Creates a new comparator, mimicing the behavior of Lucene's internal
44     * edit distance.
45     */
46    public LuceneLevenshteinDistance() {}
47  
48    @Override
49    public float getDistance(String target, String other) {
50      IntsRef targetPoints;
51      IntsRef otherPoints;
52      int n;
53      int d[][]; // cost array
54      
55      // NOTE: if we cared, we could 3*m space instead of m*n space, similar to 
56      // what LevenshteinDistance does, except cycling thru a ring of three 
57      // horizontal cost arrays... but this comparator is never actually used by 
58      // DirectSpellChecker, it's only used for merging results from multiple shards 
59      // in "distributed spellcheck", and it's inefficient in other ways too...
60  
61      // cheaper to do this up front once
62      targetPoints = toIntsRef(target);
63      otherPoints = toIntsRef(other);
64      n = targetPoints.length;
65      final int m = otherPoints.length;
66      d = new int[n+1][m+1];
67      
68      if (n == 0 || m == 0) {
69        if (n == m) {
70          return 0;
71        }
72        else {
73          return Math.max(n, m);
74        }
75      } 
76  
77      // indexes into strings s and t
78      int i; // iterates through s
79      int j; // iterates through t
80  
81      int t_j; // jth character of t
82  
83      int cost; // cost
84  
85      for (i = 0; i<=n; i++) {
86        d[i][0] = i;
87      }
88      
89      for (j = 0; j<=m; j++) {
90        d[0][j] = j;
91      }
92  
93      for (j = 1; j<=m; j++) {
94        t_j = otherPoints.ints[j-1];
95  
96        for (i=1; i<=n; i++) {
97          cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
98          // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
99          d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
100         // transposition
101         if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
102           d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
103         }
104       }
105     }
106     
107     return 1.0f - ((float) d[n][m] / Math.min(m, n));
108   }
109   
110   private static IntsRef toIntsRef(String s) {
111     IntsRef ref = new IntsRef(s.length()); // worst case
112     int utf16Len = s.length();
113     for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
114       cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
115     }
116     return ref;
117   }
118 }